// Load the raw GHS extract and restrict it to the British estimation sample.
clear all
use "../data/combined general household survey.dta", clear // raw data from Oreopoulos (2008)

gen yearat14 = yobirth+14 // calendar year at age 14 (two-digit, e.g. 47 = 1947)
gen ageat47 = 47-yobirth // age at calendar year 1947
gen drop14 = ageat47>=15 // pre-reform cohort (already 15 or older in 1947)
gen drop15 = (drop14==0) // post-reform cohort

gen learn = log(earn) // log annual nominal labor earnings; missing when earn is missing or non-positive

keep if nireland==0 // British sample only
keep if yearat14>=35 & yearat14<=65 // turned age 14 between 1935 and 1965 (also drops missing yobirth)
// years of education is large enough; !missing() also rejects extended missing
// codes (.a-.z), which a plain comparison against . would let through
keep if agelfted>=10 & !missing(agelfted)
keep if datyear<=98 // follow the year range in Oreopoulos (2006)
keep if !missing(learn)
// correct some erratic age reports: overwrite age whenever it is more than one year
// away from datyear - yobirth (a missing age also satisfies the condition and is filled in)
replace age = datyear - yobirth if abs(age-(datyear-yobirth))>1
keep if age<=64 // age 25-64 (everyone is already old enough in the first year of GHS)

// Descriptive output for the restricted British sample.
// yschl is only used for the tables in this section; it is not among the
// variables carried through the collapse further down.
tabulate agelfted, missing
generate yschl = min(agelfted - 5, 20) // years of schooling, top-coded at 20

tabulate datyear yschl

tabstat earn learn yschl agelfted age yobirth datyear, statistics(mean sd min p1 p10 q p90 p99 max N) columns(statistics)

// Quartic polynomial terms in rescaled age, (age-45)/20, and rescaled cohort,
// (yearat14-50)/15. The locals poly and poly_age collect the generated variable
// names (cohort terms and age terms respectively) for use further down.
local poly
local poly_age
foreach k of numlist 1/4 {
	generate age_`k' = ((age - 45) / 20)^`k'
	generate yearat14_`k' = ((yearat14 - 50) / 15)^`k'
	local poly_age `poly_age' age_`k'
	local poly `poly' yearat14_`k'
}

// Aggregate the micro data to (agelfted, yearat14, datyear, age) cells:
// nobs becomes the cell size, each variable in vlist becomes its cell mean,
// and a companion <name>_sd variable holds its within-cell standard deviation.
generate nobs = 1
local vlist learn drop15 `poly' `poly_age'
local vlist_sd
foreach x of local vlist {
	// collapse syntax newname=oldname, so the sd is stored next to the mean
	local vlist_sd `vlist_sd' `x'_sd=`x'
}
collapse (sum) nobs (mean) `vlist' (sd) `vlist_sd', by(agelfted yearat14 datyear age)
// make sure age and cohort variables are degenerate within cell
// (their _sd should be zero, or missing for single-observation cells)
foreach x of local vlist {
	summarize `x'_sd [fweight=nobs]
}
summarize nobs, detail

compress
save ../processed/GHS_aggr.dta, replace
	
// clean Northern Irish data below
// Load the raw GHS extract again and restrict it to the Northern Ireland estimation sample.
clear all
use "../data/combined general household survey.dta", clear // raw data from Oreopoulos (2008)

gen yearat14 = yobirth+14 // calendar year at age 14 (two-digit, e.g. 57 = 1957)
gen ageat57 = 57-yobirth // age at calendar year 1957
gen drop14 = ageat57>=15 // pre-reform cohort (already 15 or older in 1957)
gen drop15 = (drop14==0) // post-reform cohort

gen learn = log(earn) // log annual nominal labor earnings; missing when earn is missing or non-positive

keep if nireland==1 // Northern Ireland sample only
keep if yearat14>=35 & yearat14<=65 // turned age 14 between 1935 and 1965 (also drops missing yobirth)
// years of education is large enough; !missing() also rejects extended missing
// codes (.a-.z), which a plain comparison against . would let through
keep if agelfted>=10 & !missing(agelfted)
keep if datyear<=98 // follow the year range in Oreopoulos (2006)
keep if !missing(learn)
// correct some erratic age reports: overwrite age whenever it is more than one year
// away from datyear - yobirth (a missing age also satisfies the condition and is filled in)
replace age = datyear - yobirth if abs(age-(datyear-yobirth))>1
keep if age<=64 // age 25-64 (everyone is already old enough in the first year of GHS)

// Descriptive output for the restricted Northern Ireland sample.
// yschl is only used for the tables in this section; it is not among the
// variables carried through the collapse further down.
tabulate agelfted, missing
generate yschl = min(agelfted - 5, 20) // years of schooling, top-coded at 20

tabulate datyear yschl

tabstat earn learn yschl agelfted age yobirth datyear, statistics(mean sd min p1 p10 q p90 p99 max N) columns(statistics)

// Quartic polynomial terms in rescaled age, (age-45)/20, and rescaled cohort,
// (yearat14-50)/15. The locals poly and poly_age are reset here and then collect
// the generated variable names (cohort terms and age terms respectively).
local poly
local poly_age
foreach k of numlist 1/4 {
	generate age_`k' = ((age - 45) / 20)^`k'
	generate yearat14_`k' = ((yearat14 - 50) / 15)^`k'
	local poly_age `poly_age' age_`k'
	local poly `poly' yearat14_`k'
}

// Aggregate the micro data to (agelfted, yearat14, datyear, age) cells:
// nobs becomes the cell size, each variable in vlist becomes its cell mean,
// and a companion <name>_sd variable holds its within-cell standard deviation.
generate nobs = 1
local vlist learn drop15 `poly' `poly_age'
local vlist_sd
foreach x of local vlist {
	// collapse syntax newname=oldname, so the sd is stored next to the mean
	local vlist_sd `vlist_sd' `x'_sd=`x'
}
collapse (sum) nobs (mean) `vlist' (sd) `vlist_sd', by(agelfted yearat14 datyear age)
// make sure age and cohort variables are degenerate within cell
// (their _sd should be zero, or missing for single-observation cells)
foreach x of local vlist {
	summarize `x'_sd [fweight=nobs]
}
summarize nobs, detail

compress
save ../processed/GHS_aggr_NI.dta, replace
